This is an exploratory data analysis (EDA) for the Kaggle competition House Prices: Advanced Regression Techniques.
#install.packages("moments")
library(ggplot2) # Data visualization
library(readr) # CSV file I/O, e.g. the read_csv function
library(moments)
library(gridExtra)
library(caret)
## Loading required package: lattice
library(corrplot)
# Fix the random-number seed so that any randomised step is reproducible.
set.seed(1234)
Load the training data:
# Read the training set. stringsAsFactors is set explicitly because its default
# changed to FALSE in R 4.0.0, while the str()/summary() output shown in this
# document expects text columns to be factors.
train <- read.csv("../input/train.csv", stringsAsFactors = TRUE)
These are detected as numeric but they are categorical variables:
# These three columns hold numeric codes for categories, so convert them to factors.
train$MSSubClass <- as.factor(train$MSSubClass)
train$OverallQual <- as.factor(train$OverallQual)
train$OverallCond <- as.factor(train$OverallCond)
#train$YearBuilt <- as.factor(train$YearBuilt)
#train$YearRemodAdd <- as.factor(train$YearRemodAdd)
# Split the column names into numeric and non-numeric groups. vapply() always
# returns a logical vector (sapply()'s return type depends on its input), and a
# logical mask can index names() directly without which().
numeric_var <- names(train)[vapply(train, is.numeric, logical(1))]
factor_var <- names(train)[!(names(train) %in% numeric_var)]
# Print the type and the first few values of every column.
str(train)
## 'data.frame': 1460 obs. of 81 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : Factor w/ 15 levels "20","30","40",..: 6 1 6 7 6 5 1 6 5 15 ...
## $ MSZoning : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
## $ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
## $ Alley : Factor w/ 2 levels "Grvl","Pave": NA NA NA NA NA NA NA NA NA NA ...
## $ LotShape : Factor w/ 4 levels "IR1","IR2","IR3",..: 4 4 1 1 1 1 4 1 4 4 ...
## $ LandContour : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Utilities : Factor w/ 2 levels "AllPub","NoSeWa": 1 1 1 1 1 1 1 1 1 1 ...
## $ LotConfig : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
## $ LandSlope : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
## $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 14 12 21 17 18 4 ...
## $ Condition1 : Factor w/ 9 levels "Artery","Feedr",..: 3 2 3 3 3 3 3 5 1 1 ...
## $ Condition2 : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 1 ...
## $ BldgType : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ HouseStyle : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 3 6 6 6 1 3 6 1 2 ...
## $ OverallQual : Factor w/ 10 levels "1","2","3","4",..: 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : Factor w/ 9 levels "1","2","3","4",..: 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ RoofMatl : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Exterior1st : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
## $ Exterior2nd : Factor w/ 16 levels "AsbShng","AsphShn",..: 14 9 14 16 14 14 14 7 16 9 ...
## $ MasVnrType : Factor w/ 4 levels "BrkCmn","BrkFace",..: 2 3 2 3 2 3 4 4 3 3 ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 4 3 4 3 4 4 4 ...
## $ ExterCond : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ Foundation : Factor w/ 6 levels "BrkTil","CBlock",..: 3 2 3 1 3 6 3 2 1 1 ...
## $ BsmtQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 3 4 3 3 1 3 4 4 ...
## $ BsmtCond : Factor w/ 4 levels "Fa","Gd","Po",..: 4 4 4 2 4 4 4 4 4 4 ...
## $ BsmtExposure : Factor w/ 4 levels "Av","Gd","Mn",..: 4 2 3 4 1 4 1 3 4 4 ...
## $ BsmtFinType1 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 3 1 3 1 3 3 3 1 6 3 ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 6 6 6 6 6 6 6 2 6 6 ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ HeatingQC : Factor w/ 5 levels "Ex","Fa","Gd",..: 1 1 1 3 1 1 1 1 3 1 ...
## $ CentralAir : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
## $ Electrical : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 3 3 4 3 4 4 4 ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 3 7 ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : Factor w/ 5 levels "Ex","Fa","Gd",..: NA 5 5 3 5 NA 3 5 5 5 ...
## $ GarageType : Factor w/ 6 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 6 2 ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : Factor w/ 3 levels "Fin","RFn","Unf": 2 2 2 3 2 3 2 2 3 2 ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 2 3 ...
## $ GarageCond : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ PavedDrive : Factor w/ 3 levels "N","P","Y": 3 3 3 3 3 3 3 3 3 3 ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : Factor w/ 3 levels "Ex","Fa","Gd": NA NA NA NA NA NA NA NA NA NA ...
## $ Fence : Factor w/ 4 levels "GdPrv","GdWo",..: NA NA NA NA NA 3 NA NA NA NA ...
## $ MiscFeature : Factor w/ 4 levels "Gar2","Othr",..: NA NA NA NA NA 3 NA 3 NA NA ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : Factor w/ 9 levels "COD","Con","ConLD",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
There are 1460 data rows and 81 variables
# Per-column summary: quantiles for numeric columns, level counts for factors.
summary(train)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 20 :536 C (all): 10 Min. : 21.00
## 1st Qu.: 365.8 60 :299 FV : 65 1st Qu.: 59.00
## Median : 730.5 50 :144 RH : 16 Median : 69.00
## Mean : 730.5 120 : 87 RL :1151 Mean : 70.05
## 3rd Qu.:1095.2 30 : 69 RM : 218 3rd Qu.: 80.00
## Max. :1460.0 160 : 63 Max. :313.00
## (Other):262 NA's :259
## LotArea Street Alley LotShape LandContour
## Min. : 1300 Grvl: 6 Grvl: 50 IR1:484 Bnk: 63
## 1st Qu.: 7554 Pave:1454 Pave: 41 IR2: 41 HLS: 50
## Median : 9478 NA's:1369 IR3: 10 Low: 36
## Mean : 10517 Reg:925 Lvl:1311
## 3rd Qu.: 11602
## Max. :215245
##
## Utilities LotConfig LandSlope Neighborhood Condition1
## AllPub:1459 Corner : 263 Gtl:1382 NAmes :225 Norm :1260
## NoSeWa: 1 CulDSac: 94 Mod: 65 CollgCr:150 Feedr : 81
## FR2 : 47 Sev: 13 OldTown:113 Artery : 48
## FR3 : 4 Edwards:100 RRAn : 26
## Inside :1052 Somerst: 86 PosN : 19
## Gilbert: 79 RRAe : 11
## (Other):707 (Other): 15
## Condition2 BldgType HouseStyle OverallQual OverallCond
## Norm :1445 1Fam :1220 1Story :726 5 :397 5 :821
## Feedr : 6 2fmCon: 31 2Story :445 6 :374 6 :252
## Artery : 2 Duplex: 52 1.5Fin :154 7 :319 7 :205
## PosN : 2 Twnhs : 43 SLvl : 65 8 :168 8 : 72
## RRNn : 2 TwnhsE: 114 SFoyer : 37 4 :116 4 : 57
## PosA : 1 1.5Unf : 14 9 : 43 3 : 25
## (Other): 2 (Other): 19 (Other): 43 (Other): 28
## YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1872 Min. :1950 Flat : 13 CompShg:1434 VinylSd:515
## 1st Qu.:1954 1st Qu.:1967 Gable :1141 Tar&Grv: 11 HdBoard:222
## Median :1973 Median :1994 Gambrel: 11 WdShngl: 6 MetalSd:220
## Mean :1971 Mean :1985 Hip : 286 WdShake: 5 Wd Sdng:206
## 3rd Qu.:2000 3rd Qu.:2004 Mansard: 7 ClyTile: 1 Plywood:108
## Max. :2010 Max. :2010 Shed : 2 Membran: 1 CemntBd: 61
## (Other): 2 (Other):128
## Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond
## VinylSd:504 BrkCmn : 15 Min. : 0.0 Ex: 52 Ex: 3
## MetalSd:214 BrkFace:445 1st Qu.: 0.0 Fa: 14 Fa: 28
## HdBoard:207 None :864 Median : 0.0 Gd:488 Gd: 146
## Wd Sdng:197 Stone :128 Mean : 103.7 TA:906 Po: 1
## Plywood:142 NA's : 8 3rd Qu.: 166.0 TA:1282
## CmentBd: 60 Max. :1600.0
## (Other):136 NA's :8
## Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1
## BrkTil:146 Ex :121 Fa : 45 Av :221 ALQ :220
## CBlock:634 Fa : 35 Gd : 65 Gd :134 BLQ :148
## PConc :647 Gd :618 Po : 2 Mn :114 GLQ :418
## Slab : 24 TA :649 TA :1311 No :953 LwQ : 74
## Stone : 6 NA's: 37 NA's: 37 NA's: 38 Rec :133
## Wood : 3 Unf :430
## NA's: 37
## BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
## Min. : 0.0 ALQ : 19 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.0 BLQ : 33 1st Qu.: 0.00 1st Qu.: 223.0
## Median : 383.5 GLQ : 14 Median : 0.00 Median : 477.5
## Mean : 443.6 LwQ : 46 Mean : 46.55 Mean : 567.2
## 3rd Qu.: 712.2 Rec : 54 3rd Qu.: 0.00 3rd Qu.: 808.0
## Max. :5644.0 Unf :1256 Max. :1474.00 Max. :2336.0
## NA's: 38
## TotalBsmtSF Heating HeatingQC CentralAir Electrical
## Min. : 0.0 Floor: 1 Ex:741 N: 95 FuseA: 94
## 1st Qu.: 795.8 GasA :1428 Fa: 49 Y:1365 FuseF: 27
## Median : 991.5 GasW : 18 Gd:241 FuseP: 3
## Mean :1057.4 Grav : 7 Po: 1 Mix : 1
## 3rd Qu.:1298.2 OthW : 2 TA:428 SBrkr:1334
## Max. :6110.0 Wall : 4 NA's : 1
##
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea
## Min. : 334 Min. : 0 Min. : 0.000 Min. : 334
## 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130
## Median :1087 Median : 0 Median : 0.000 Median :1464
## Mean :1163 Mean : 347 Mean : 5.845 Mean :1515
## 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777
## Max. :4692 Max. :2065 Max. :572.000 Max. :5642
##
## BsmtFullBath BsmtHalfBath FullBath HalfBath
## Min. :0.0000 Min. :0.00000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :2.000 Median :0.0000
## Mean :0.4253 Mean :0.05753 Mean :1.565 Mean :0.3829
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.00000 Max. :3.000 Max. :2.0000
##
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Min. :0.000 Ex:100 Min. : 2.000 Maj1: 14
## 1st Qu.:2.000 1st Qu.:1.000 Fa: 39 1st Qu.: 5.000 Maj2: 5
## Median :3.000 Median :1.000 Gd:586 Median : 6.000 Min1: 31
## Mean :2.866 Mean :1.047 TA:735 Mean : 6.518 Min2: 34
## 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.000 Mod : 15
## Max. :8.000 Max. :3.000 Max. :14.000 Sev : 1
## Typ :1360
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## Min. :0.000 Ex : 24 2Types : 6 Min. :1900 Fin :352
## 1st Qu.:0.000 Fa : 33 Attchd :870 1st Qu.:1961 RFn :422
## Median :1.000 Gd :380 Basment: 19 Median :1980 Unf :605
## Mean :0.613 Po : 20 BuiltIn: 88 Mean :1979 NA's: 81
## 3rd Qu.:1.000 TA :313 CarPort: 9 3rd Qu.:2002
## Max. :3.000 NA's:690 Detchd :387 Max. :2010
## NA's : 81 NA's :81
## GarageCars GarageArea GarageQual GarageCond PavedDrive
## Min. :0.000 Min. : 0.0 Ex : 3 Ex : 2 N: 90
## 1st Qu.:1.000 1st Qu.: 334.5 Fa : 48 Fa : 35 P: 30
## Median :2.000 Median : 480.0 Gd : 14 Gd : 9 Y:1340
## Mean :1.767 Mean : 473.0 Po : 3 Po : 7
## 3rd Qu.:2.000 3rd Qu.: 576.0 TA :1311 TA :1326
## Max. :4.000 Max. :1418.0 NA's: 81 NA's: 81
##
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 25.00 Median : 0.00 Median : 0.00
## Mean : 94.24 Mean : 46.66 Mean : 21.95 Mean : 3.41
## 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :857.00 Max. :547.00 Max. :552.00 Max. :508.00
##
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## Min. : 0.00 Min. : 0.000 Ex : 2 GdPrv: 59 Gar2: 2
## 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2 GdWo : 54 Othr: 2
## Median : 0.00 Median : 0.000 Gd : 3 MnPrv: 157 Shed: 49
## Mean : 15.06 Mean : 2.759 NA's:1453 MnWw : 11 TenC: 1
## 3rd Qu.: 0.00 3rd Qu.: 0.000 NA's :1179 NA's:1406
## Max. :480.00 Max. :738.000
##
## MiscVal MoSold YrSold SaleType
## Min. : 0.00 Min. : 1.000 Min. :2006 WD :1267
## 1st Qu.: 0.00 1st Qu.: 5.000 1st Qu.:2007 New : 122
## Median : 0.00 Median : 6.000 Median :2008 COD : 43
## Mean : 43.49 Mean : 6.322 Mean :2008 ConLD : 9
## 3rd Qu.: 0.00 3rd Qu.: 8.000 3rd Qu.:2009 ConLI : 5
## Max. :15500.00 Max. :12.000 Max. :2010 ConLw : 5
## (Other): 9
## SaleCondition SalePrice
## Abnorml: 101 Min. : 34900
## AdjLand: 4 1st Qu.:129975
## Alloca : 12 Median :163000
## Family : 20 Mean :180921
## Normal :1198 3rd Qu.:214000
## Partial: 125 Max. :755000
##
We can see that some variables contain NA values:
# Positions (named by column) of the columns that contain at least one NA.
# is.na() on the data frame yields a logical matrix directly, which avoids
# apply() coercing every column to character first.
which(colSums(is.na(train)) > 0)
## LotFrontage Alley MasVnrType MasVnrArea BsmtQual
## 4 7 26 27 31
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2 Electrical
## 32 33 34 36 43
## FireplaceQu GarageType GarageYrBlt GarageFinish GarageQual
## 58 59 60 61 64
## GarageCond PoolQC Fence MiscFeature
## 65 73 74 75
# Share of missing values per column, rounded to 3 decimals. Despite the
# "percent" in the name this is a proportion between 0 and 1.
all_vars_napercent <- round(colSums(is.na(train)) / nrow(train), 3)
# Columns whose rounded share is above zero; reuses the vector computed above
# instead of recomputing it.
naVars <- names(train)[which(all_vars_napercent > 0)]
# Show the affected columns, most incomplete first.
sort(all_vars_napercent[naVars], decreasing = TRUE)
## PoolQC MiscFeature Alley Fence FireplaceQu
## 0.995 0.963 0.938 0.808 0.473
## LotFrontage GarageType GarageYrBlt GarageFinish GarageQual
## 0.177 0.055 0.055 0.055 0.055
## GarageCond BsmtExposure BsmtFinType2 BsmtQual BsmtCond
## 0.055 0.026 0.026 0.025 0.025
## BsmtFinType1 MasVnrType MasVnrArea Electrical
## 0.025 0.005 0.005 0.001
Some variables, such as PoolQC, MiscFeature, Alley and Fence, have a very high ratio of NA values.
We can see that all of the 1460 rows have one or more NA values (the number of rows without any NA is zero):
# Number of rows that have no NA in any column. rowSums() over the logical
# is.na() matrix avoids a row-wise apply(), which would first coerce the whole
# data frame to a character matrix.
sum(rowSums(is.na(train)) == 0)
## [1] 0
Alternatively:
# Count the rows that have no NA in any column.
sum(complete.cases(train))
## [1] 0
Even so, only 5.8895654% of all values in the data set are missing.
We can check this and get more detailed information using a specialized missing-data package such as VIM (used below) or MICE:
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Missing-value overview of the whole training set via aggr().
# sortVars=TRUE orders the variables by their share of missing values, and
# numbers=TRUE asks for the proportions to be displayed.
# NOTE(review): despite the name `mice_plot`, aggr() presumably comes from the
# VIM package loaded just above, not from mice - confirm.
mice_plot <- aggr(train, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(train), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
## Warning in plot.aggr(res, ...): not enough vertical space to display
## frequencies (too many combinations)
##
## Variables sorted by number of missings:
## Variable Count
## PoolQC 0.9952054795
## MiscFeature 0.9630136986
## Alley 0.9376712329
## Fence 0.8075342466
## FireplaceQu 0.4726027397
## LotFrontage 0.1773972603
## GarageType 0.0554794521
## GarageYrBlt 0.0554794521
## GarageFinish 0.0554794521
## GarageQual 0.0554794521
## GarageCond 0.0554794521
## BsmtExposure 0.0260273973
## BsmtFinType2 0.0260273973
## BsmtQual 0.0253424658
## BsmtCond 0.0253424658
## BsmtFinType1 0.0253424658
## MasVnrType 0.0054794521
## MasVnrArea 0.0054794521
## Electrical 0.0006849315
## Id 0.0000000000
## MSSubClass 0.0000000000
## MSZoning 0.0000000000
## LotArea 0.0000000000
## Street 0.0000000000
## LotShape 0.0000000000
## LandContour 0.0000000000
## Utilities 0.0000000000
## LotConfig 0.0000000000
## LandSlope 0.0000000000
## Neighborhood 0.0000000000
## Condition1 0.0000000000
## Condition2 0.0000000000
## BldgType 0.0000000000
## HouseStyle 0.0000000000
## OverallQual 0.0000000000
## OverallCond 0.0000000000
## YearBuilt 0.0000000000
## YearRemodAdd 0.0000000000
## RoofStyle 0.0000000000
## RoofMatl 0.0000000000
## Exterior1st 0.0000000000
## Exterior2nd 0.0000000000
## ExterQual 0.0000000000
## ExterCond 0.0000000000
## Foundation 0.0000000000
## BsmtFinSF1 0.0000000000
## BsmtFinSF2 0.0000000000
## BsmtUnfSF 0.0000000000
## TotalBsmtSF 0.0000000000
## Heating 0.0000000000
## HeatingQC 0.0000000000
## CentralAir 0.0000000000
## X1stFlrSF 0.0000000000
## X2ndFlrSF 0.0000000000
## LowQualFinSF 0.0000000000
## GrLivArea 0.0000000000
## BsmtFullBath 0.0000000000
## BsmtHalfBath 0.0000000000
## FullBath 0.0000000000
## HalfBath 0.0000000000
## BedroomAbvGr 0.0000000000
## KitchenAbvGr 0.0000000000
## KitchenQual 0.0000000000
## TotRmsAbvGrd 0.0000000000
## Functional 0.0000000000
## Fireplaces 0.0000000000
## GarageCars 0.0000000000
## GarageArea 0.0000000000
## PavedDrive 0.0000000000
## WoodDeckSF 0.0000000000
## OpenPorchSF 0.0000000000
## EnclosedPorch 0.0000000000
## X3SsnPorch 0.0000000000
## ScreenPorch 0.0000000000
## PoolArea 0.0000000000
## MiscVal 0.0000000000
## MoSold 0.0000000000
## YrSold 0.0000000000
## SaleType 0.0000000000
## SaleCondition 0.0000000000
## SalePrice 0.0000000000
If we want to use machine learning models such as linear regression, we will need to impute values for these NAs.
Let’s see which variables have zero or near-zero variance, using very conservative thresholds:
# Compute the zero / near-zero variance metrics for every column.
# freqCut and uniqueCut are the thresholds passed to nearZeroVar(); see its
# documentation for their exact meaning.
vars_variance <- nearZeroVar(
  train,
  freqCut = 99/1,
  uniqueCut = 1,
  saveMetrics = TRUE
)
# Keep only the rows (variables) flagged by either indicator.
vars_variance[vars_variance$zeroVar | vars_variance$nzv, ]
## freqRatio percentUnique zeroVar nzv
## Street 242.3333 0.1369863 FALSE TRUE
## Utilities 1459.0000 0.1369863 FALSE TRUE
## Condition2 240.8333 0.5479452 FALSE TRUE
## RoofMatl 130.3636 0.5479452 FALSE TRUE
## PoolArea 1453.0000 0.5479452 FALSE TRUE
There aren’t any variables with zero variance. We will have to examine these near-zero-variance variables in detail to see whether some of them can be eliminated.
The outcome variable to predict is SalePrice. We will take a detailed look at the values of this variable in the training data:
# Min, quartiles, mean and max of the target variable.
summary(train$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 130000 163000 180900 214000 755000
# Box plot of the raw sale prices.
boxplot(train$SalePrice)
We can see that SalePrice is right-skewed and does not follow a normal distribution:
# Skewness of the raw sale prices.
skewness(train$SalePrice)
## [1] 1.880941
# Shapiro-Wilk normality test on the raw sale prices.
shapiro.test(train$SalePrice)
##
## Shapiro-Wilk normality test
##
## data: train$SalePrice
## W = 0.86967, p-value < 2.2e-16
# Normal Q-Q plot of the raw sale prices, with a reference line in colour 2.
qqnorm(train$SalePrice)
qqline(train$SalePrice, col = 2)
We can try to check what happens with a log transformation of SalePrice:
# Natural log of the sale prices, followed by its histogram and skewness.
logSalePrice <- log(train$SalePrice)
hist(logSalePrice)
skewness(logSalePrice)
## [1] 0.1212104
# Shapiro-Wilk normality test on the log-transformed sale prices.
shapiro.test(logSalePrice)
##
## Shapiro-Wilk normality test
##
## data: logSalePrice
## W = 0.99121, p-value = 1.149e-07
# Normal Q-Q plot of the log-transformed sale prices, with a reference line in colour 2.
qqnorm(logSalePrice)
qqline(logSalePrice, col = 2)
It looks closer to a normal distribution, but it still has some right skew and is not completely normal, as the Shapiro-Wilk test shows: at a 0.05 significance level we must reject the null hypothesis that the data follow a normal distribution.
Let's take a look at the categorical variables:
# Bar chart of the value counts of a single column.
#
# data_in: data frame holding the column to plot.
# i:       index of the column within data_in; the column name found at that
#          position becomes the x-axis label.
# Returns the ggplot object.
plotHist <- function(data_in, i) {
  axis_label <- colnames(data_in)[i]
  plot_df <- data.frame(x = data_in[[i]])
  # x tick labels are rotated by 90 degrees.
  rotated_ticks <- theme(axis.text.x = element_text(angle = 90, hjust = 1))
  chart <- ggplot(data = plot_df, aes(x = factor(x))) +
    stat_count() +
    xlab(axis_label) +
    theme_light() +
    rotated_ticks
  chart
}
# Build one plot per requested column and lay them all out in a single grid.
#
# data_in: data frame handed through to `fun`.
# fun:     plotting function, called as fun(data_in = data_in, i = i); its
#          results are passed on to grid.arrange().
# ii:      vector of column indices; one plot is produced per element.
# ncol:    number of columns of the grid (default 3).
# Returns whatever grid.arrange() returns.
doPlots <- function(data_in, fun, ii, ncol = 3) {
  # lapply() builds the list in one pass instead of re-copying it with c() on
  # every iteration. unname() keeps the plots positional even when `ii` carries
  # names, so none of them is mistaken for a named grid.arrange() argument.
  pp <- unname(lapply(ii, function(i) fun(data_in = data_in, i = i)))
  do.call("grid.arrange", c(pp, ncol = ncol))
}
# Density curve of a single column, labelled with the column name and skewness.
#
# data_in: data frame holding the column to plot. Its SalePrice column is
#          copied into the plotting data but is not mapped to any aesthetic.
# i:       index of the column within data_in.
# Returns the ggplot object.
plotDen <- function(data_in, i) {
  values <- data_in[[i]]
  plot_df <- data.frame(x = values, SalePrice = data_in$SalePrice)
  # Skewness is computed with NAs removed and rounded to 2 decimals for the label.
  skew_text <- round(skewness(values, na.rm = TRUE), 2)
  axis_label <- paste0(colnames(data_in)[i], '\n', 'Skewness: ', skew_text)
  density_curve <- geom_line(aes(x = x), stat = 'density', size = 1, alpha = 1.0)
  ggplot(data = plot_df) + density_curve + xlab(axis_label) + theme_light()
}
# Bar charts for the non-numeric columns, drawn in batches with two plots per row.
# NOTE(review): the index ranges are hard-coded and assume factor_var has at
# least 46 entries - confirm if the column set changes.
doPlots(train[,factor_var], fun = plotHist, ii = 1:4, ncol = 2)
doPlots(train[,factor_var], fun = plotHist, ii = 5:9, ncol = 2)
doPlots(train[,factor_var], fun = plotHist, ii = 10:14, ncol = 2)
doPlots(train[,factor_var], fun = plotHist, ii = 15:20, ncol = 2)
doPlots(train[,factor_var], fun = plotHist, ii = 21:28, ncol = 2)
doPlots(train[,factor_var], fun = plotHist, ii = 29:38, ncol = 2)
doPlots(train[,factor_var], fun = plotHist, ii = 39:46, ncol = 2)
We show the density plots for the continuous variables:
# Density plots for numeric columns 2 to 9.
# NOTE(review): index 1 is skipped, presumably because it is Id - confirm.
doPlots(train[,numeric_var], fun = plotDen, ii = 2:9, ncol = 2)
## Warning: Removed 259 rows containing non-finite values (stat_density).
## Warning: Removed 8 rows containing non-finite values (stat_density).
# Density plots for numeric columns 10 to 25, in two batches of eight.
doPlots(train[,numeric_var], fun = plotDen, ii = 10:17, ncol = 2)
doPlots(train[,numeric_var], fun = plotDen, ii = 18:25, ncol = 2)
## Warning: Removed 81 rows containing non-finite values (stat_density).
# Density plots for numeric columns 26 to 35.
# NOTE(review): the upper bound is hard-coded and assumes numeric_var has at
# least 35 entries - confirm if the column set changes.
doPlots(train[,numeric_var], fun = plotDen, ii = 26:35, ncol = 2)
We take a look at the correlations between the numerical variables (omitting the Id variable):
# Numeric variables without the row identifier. Dropping "Id" by name rather
# than by position stays correct even if Id is not the first numeric column,
# and avoids the 2:length() pitfall when only one numeric column exists.
cont_vars <- setdiff(numeric_var, "Id")
# Pairwise correlations, computed only on rows with no NA in these columns.
correlations <- cor(na.omit(train[, cont_vars]))
corrplot(correlations, method="square")
Let’s see which variables are most correlated:
# Flag variables whose absolute correlation exceeds 0.3 in more than one cell
# of their row. The diagonal self-correlation accounts for one hit, so "> 1"
# means at least one other variable.
high_correlations_index <- rowSums(abs(correlations) > 0.3) > 1
# Restrict the correlation matrix to the flagged variables and plot it.
high_correlations <- correlations[high_correlations_index, high_correlations_index]
corrplot(high_correlations, method = "square")
The highly correlated variables are these:
# Names of the variables selected by the logical mask high_correlations_index.
cont_vars[high_correlations_index]
## [1] "LotFrontage" "LotArea" "YearBuilt" "YearRemodAdd"
## [5] "MasVnrArea" "BsmtFinSF1" "BsmtUnfSF" "TotalBsmtSF"
## [9] "X1stFlrSF" "X2ndFlrSF" "GrLivArea" "BsmtFullBath"
## [13] "FullBath" "HalfBath" "BedroomAbvGr" "TotRmsAbvGrd"
## [17] "Fireplaces" "GarageYrBlt" "GarageCars" "GarageArea"
## [21] "WoodDeckSF" "OpenPorchSF" "EnclosedPorch" "SalePrice"